MUS 15 Project 2: Audio Transcription

Test


Gino Prasad, Saba Heydari Seradj, Ashish Dalvi

05/27/2023


For our MUS 15 project, our group created an audio transcriber from scratch; all of the code was written by us in Python.

We have uploaded this code to https://github.com/GinoP123/MusicTranscription

In [11]:
import torch
import torchaudio
from IPython.display import Audio
import numpy as np
import matplotlib.pyplot as plt
from scipy.fft import fft, ifft
import scipy.ndimage
import pandas as pd
import tqdm

print(torch.__version__)
print(torchaudio.__version__)
2.0.0.dev20221214
0.14.0.dev20221214

Loading Example Audio

In [12]:
# Path (relative to the notebook) of the recording to transcribe.
audio_path = 'wav_files/audio1623001661.wav'
In [13]:
# Load the recording; torchaudio returns a (channels, samples) tensor and the sample rate.
bruyeres_wav, sample_rate = torchaudio.load(audio_path)
# Keep channel 0 only, and trim 19 s from the start and 7 s from the end
# (presumably silence/noise surrounding the performance — TODO confirm).
bruyeres_wav = bruyeres_wav[0][19*sample_rate:-7*sample_rate]
In [14]:
Audio(bruyeres_wav, rate=sample_rate)
Out[14]:

Assigning Pitches to Frequencies

In [15]:
# Reference pitches for a single octave (octave 4, twelve-tone equal
# temperament with A4 = 440 Hz); other octaves are derived by doubling.
prelim_notes_octave = 4
prelim_notes = [
    ("C", 261.63),
    ("C#", 277.18),
    ("D", 293.66),
    ("D#", 311.13),
    ("E", 329.63),
    ("F", 349.23),
    ("F#", 369.99),
    ("G", 392.00),
    ("G#", 415.30),
    ("A", 440.00),
    ("A#", 466.16),
    ("B", 493.88),
]
prelim_notes
Out[15]:
[('C', 261.63),
 ('C#', 277.18),
 ('D', 293.66),
 ('D#', 311.13),
 ('E', 329.63),
 ('F', 349.23),
 ('F#', 369.99),
 ('G', 392.0),
 ('G#', 415.3),
 ('A', 440.0),
 ('A#', 466.16),
 ('B', 493.88)]
In [16]:
# Expand the octave-4 reference pitches to octaves 0-7: each octave step
# doubles (or halves) the frequency.  Rows are labelled name+octave, e.g. "A4".
expanded = [
    (freq * (2 ** (octave - prelim_notes_octave)), f"{name}{octave}")
    for octave in range(8)
    for name, freq in prelim_notes
]
notes, names = zip(*expanded)
notes = pd.DataFrame(notes, index=names, columns=['frequency'])
notes
Out[16]:
frequency
C0 16.351875
C#0 17.323750
D0 18.353750
D#0 19.445625
E0 20.601875
... ...
G7 3136.000000
G#7 3322.400000
A7 3520.000000
A#7 3729.280000
B7 3951.040000

96 rows × 1 columns

Setting the hyperparameters

In [17]:
# STFT-style analysis parameters: window length in samples (2**13 = 8192)...
window_size = 2 ** 13
# ...with consecutive windows overlapping by 75% (hop of a quarter window).
hop_size = window_size // 4
In [18]:
# # Gaussian Kernel Parameters
# gkernel_length = 21
# gkernel_std = 1
In [19]:
# gaussian_scale_percentile = 99
# radius = 3
In [20]:
# Per-overtone amplitude thresholds: a candidate note must exceed
# min_amplitude_threshold at its fundamental and an exponentially
# decaying threshold at each higher overtone of its comb.
min_amplitude_threshold = 50
overtone_comb_limit = 3  # number of harmonics (1x..3x frequency) per comb
overtone_dissipation_rate = 0.02
overtone_kernel = min_amplitude_threshold * overtone_dissipation_rate ** np.arange(overtone_comb_limit)
overtone_kernel
Out[20]:
array([5.e+01, 1.e+00, 2.e-02])
In [21]:
# NOTE(review): this value is never used — the next cell immediately rebinds
# `reasonable_notes` to ('C3', 'C7'); consider deleting this cell.
reasonable_notes = ('C0', 'C6')
In [22]:
# Restrict candidate pitches to a plausible instrument range, C3..C7 inclusive.
low_note, high_note = 'C3', 'C7'
in_range = notes['frequency'].between(notes.loc[low_note, 'frequency'],
                                      notes.loc[high_note, 'frequency'])
reasonable_notes = notes[in_range]
reasonable_notes.head()
Out[22]:
frequency
C3 130.815
C#3 138.590
D3 146.830
D#3 155.565
E3 164.815
In [23]:
# Peak amplitude used for all synthesized tones.
amplitude = 1
def get_frequency(note=None, frequency=None, length=window_size/sample_rate):
    """Synthesize a pure sine tone as a 1-D numpy array of samples.

    Exactly one of `note` (a label such as 'C5' indexing the global `notes`
    table) or `frequency` (Hz) must be provided.  `length` is the duration
    in seconds and defaults to one analysis window.
    """
    assert note is not None or frequency is not None
    if frequency is None:
        frequency = notes.loc[note]['frequency']
    sample_indices = np.arange(0, length * sample_rate).astype(np.float64)
    radians_per_sample = (frequency * 2 * np.pi) / sample_rate
    return amplitude * np.sin(radians_per_sample * sample_indices)
In [24]:
Audio(get_frequency(note='C5', length=5), rate=sample_rate)
Out[24]:

Predicting Frequencies

In [25]:
def get_amplitude(window, note=None, frequency=None, return_phase=False):
    """Correlate `window` against a single frequency (one DFT bin, computed
    directly) and return the magnitude of the response.

    Exactly one of `note` (label into the global `notes` table) or
    `frequency` (Hz) must be given.  When `return_phase` is True, returns
    the (magnitude, phase) pair instead.
    """
    assert note is not None or frequency is not None
    if frequency is None:
        frequency = notes.loc[note].frequency
    radians_per_sample = (frequency * 2 * np.pi) / sample_rate
    angles = radians_per_sample * np.arange(len(window))
    sin_coeff = np.dot(np.sin(angles), window)
    cos_coeff = np.dot(np.cos(angles), window)
    correlation = cos_coeff + (1j * sin_coeff)
    if return_phase:
        return np.absolute(correlation), np.angle(correlation)
    return np.absolute(correlation)
In [26]:
def get_combs(window):
    """For every note in `notes`, measure the amplitude of its overtone comb
    (fundamental, 2x, ... up to `overtone_comb_limit`x) within `window`.

    Returns {note name: numpy array of per-overtone amplitudes}.  A cache
    avoids re-measuring frequencies shared between different notes' combs.
    """
    amplitude_cache = {}
    combs = {}
    for note, fundamental in zip(notes.index, notes.frequency):
        comb_amplitudes = []
        for harmonic in range(1, overtone_comb_limit + 1):
            freq = fundamental * harmonic
            if freq not in amplitude_cache:
                amplitude_cache[freq] = get_amplitude(window, frequency=freq)
            comb_amplitudes.append(amplitude_cache[freq])
        combs[note] = np.array(comb_amplitudes)
    return combs
In [27]:
def predict_frequencies_window(window):
    """Return the note names whose entire overtone comb clears the
    per-overtone amplitude thresholds in `overtone_kernel`."""
    return [
        note
        for note, comb in get_combs(window).items()
        if (comb > overtone_kernel).all()
    ]
In [41]:
def predict_frequencies(wav):
    """Slide an analysis window over `wav` (a 1-D torch tensor) with stride
    `hop_size` and predict the active notes in each full window.

    Returns a list of per-window note-name lists; trailing samples that do
    not fill a whole window are ignored.
    """
    predicted = []
    last_start = len(wav) - window_size
    for start in tqdm.tqdm(range(0, len(wav), hop_size)):
        if start > last_start:
            break
        window = wav[start:start + window_size].numpy()
        predicted.append(predict_frequencies_window(window))
    return predicted

Example Window Clip

In [42]:
# Extract one analysis window starting 0.5 s into the clip, as a numpy array.
clip = bruyeres_wav[int(sample_rate*(0.5)):int(sample_rate*(0.5)+window_size)].numpy()
In [43]:
Audio(clip, rate=sample_rate)
Out[43]:
In [44]:
predict_frequencies_window(clip)
Out[44]:
['D#5']
In [45]:
get_amplitude(clip, 'D#5')
Out[45]:
139.77996524110867
In [46]:
Audio(get_frequency(note='D#5', length=5), rate=sample_rate)
Out[46]:

Hann Window

In [47]:
# Hann-shaped window scaled by 1/(window_size/hop_size) so the four
# 75%-overlapped copies overlap-add to a constant level.
# FIX: previously a `hann_window` function was defined and immediately
# shadowed by its own result (`hann_window = hann_window()`), making the
# cell non-idempotent — re-running it raised TypeError because the name no
# longer referred to a callable.  Compute the array directly instead; the
# resulting `hann_window` value is unchanged.
hann_window = (1 - np.cos(2 * np.pi * np.arange(window_size) / window_size)) / (window_size / hop_size)
In [48]:
plt.plot(hann_window)
Out[48]:
[<matplotlib.lines.Line2D at 0x1743c5430>]
In [49]:
# Sanity check: tile the window and overlap-add shifted copies at each of
# the three remaining hop offsets — the sum should be flat (constant),
# confirming the window/hop normalization.
extended_window = np.concatenate([hann_window] * 16)
composite = extended_window.copy()
for shift in (hop_size, 2 * hop_size, 3 * hop_size):
    composite[:-shift] += extended_window[shift:]
plt.plot(composite)
Out[49]:
[<matplotlib.lines.Line2D at 0x295f1cb50>]

Reconstructing Predicted Frequency

In [84]:
def reconstruct(predicted, freq=False):
    """Overlap-add resynthesis of per-window note predictions.

    Parameters
    ----------
    predicted : list of per-window lists; each entry is a note name (or a
        raw frequency in Hz when `freq=True`).
    freq : when True, entries are frequencies instead of labels indexing
        the global `notes` table.

    Returns a 1-D numpy array of (len(predicted) - 1) * hop_size +
    window_size samples.
    """
    output_length = (len(predicted) - 1) * hop_size + window_size
    composite = np.zeros(output_length)
    for i, window in enumerate(predicted):
        base = np.zeros(window_size)
        for note in window:
            frequency = note if freq else notes.loc[note].frequency
            for overtone in range(1, overtone_comb_limit + 1):
                # BUG FIX: the dissipation factor previously multiplied the
                # *frequency* (frequency*overtone*rate**(overtone-1)),
                # shifting higher overtones to ~0.02x the intended pitch.
                # Consistent with `overtone_kernel`, the rate is an
                # amplitude decay, so it must scale the tone's amplitude.
                decay = overtone_dissipation_rate ** (overtone - 1)
                base += decay * get_frequency(frequency=frequency * overtone)
        # Overlap-add the windowed frame at its hop offset.
        composite[i*hop_size:(i*hop_size)+window_size] += base * hann_window
    return composite
In [57]:
Audio(reconstruct([['A5'] for _ in range(500)]), rate=sample_rate)
Out[57]:
In [58]:
Audio(get_frequency(note='A5', length=20), rate=sample_rate)
Out[58]:

Reconstructing Entire Song

In [79]:
# predicted_frequencies = predict_frequencies(bruyeres_wav)
In [80]:
# NOTE(review): `predicted_frequencies` is produced by the commented-out cell
# above; this cell fails under Restart & Run All unless that cell is re-enabled
# (or its result is cached to disk and loaded here).
Audio(reconstruct(predicted_frequencies), rate=sample_rate)
Out[80]:
In [86]:
# NOTE(review): hardcoded absolute local path — breaks for any other machine;
# prefer a path relative to a configurable data directory.
mask = np.load('/Users/ginoprasad/Downloads/midi_mask.npy')
In [96]:
from midi2audio import FluidSynth
In [97]:
# NOTE(review): absolute local path, and playback requires a default SoundFont
# to be installed — the captured output below shows fluidsynth failing to find
# one, and the cell was interrupted by hand.
FluidSynth().play_midi('/Users/ginoprasad/Downloads/MIDI-Unprocessed_041_PIANO041_MID--AUDIO-split_07-06-17_Piano-e_1-01_wav--1.midi')
fluidsynth: error: fluid_is_soundfont(): fopen() failed: 'File does not exist.'
Parameter '/Users/ginoprasad/.fluidsynth/default_sound_font.sf2' not a SoundFont or MIDI file or error occurred identifying it.
fluidsynth: error: fluid_sfloader_load(): Failed to open '/opt/homebrew/Cellar/fluid-synth/2.3.0/share/soundfonts/default.sf2': File does not exist.
fluidsynth: error: Unable to open file '/opt/homebrew/Cellar/fluid-synth/2.3.0/share/soundfonts/default.sf2'
fluidsynth: error: Failed to load SoundFont "/opt/homebrew/Cellar/fluid-synth/2.3.0/share/soundfonts/default.sf2"
fluidsynth: warning: No preset found on channel 0 [bank=0 prog=0]

KeyboardInterrupt

In [100]:
# MIDI note number -> frequency in Hz (equal temperament).
# BUG FIX: the concert-pitch reference for MIDI note 69 (A4) is 440 Hz,
# not 400 Hz — matching the ("A", 440.00) entry in `prelim_notes` above —
# so all 128 reconstructed pitches were previously ~9% flat.
frequencies = np.array([440 * (2 ** ((n-69)/12)) for n in range(128)])
def reconstruct_mask(mask):
    """Resynthesize audio from a piano-roll mask of shape (frames, 128),
    where entry [i, n] is the amplitude of MIDI note n in frame i.

    Each frame contributes `hop_size` samples.  Returns a 1-D numpy array.
    NOTE: the allocation keeps the original (len(mask)-1)*hop_size +
    window_size length, so the final window_size - hop_size samples are
    always silent (only hop_size samples are written per frame).
    """
    output_length = (len(mask) - 1) * hop_size + window_size
    composite = np.zeros(output_length)
    for i, frame in enumerate(mask):
        base = np.zeros(hop_size)
        for amp, frequency in zip(frame, frequencies):
            if amp != 0:
                for overtone in range(1, overtone_comb_limit + 1):
                    # BUG FIX: as in `reconstruct`, the dissipation rate
                    # previously scaled the *frequency*, detuning every
                    # overtone; it is an amplitude decay and must scale
                    # the synthesized tone's amplitude instead.
                    decay = overtone_dissipation_rate ** (overtone - 1)
                    base += amp * decay * get_frequency(
                        frequency=frequency * overtone,
                        length=hop_size / sample_rate)
        composite[i*hop_size:(i*hop_size)+hop_size] += base
    return composite
In [101]:
Audio(reconstruct_mask(mask), rate=sample_rate)
Out[101]:
In [93]:
hop_size / sample_rate
Out[93]:
0.064
In [60]:
from matplotlib.pyplot import imshow
In [64]:
mask.shape
Out[64]:
(3021, 128)
In [ ]:
 
In [ ]: